AAQoL machine learning analysis with unbalanced random forest

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

# Load the survey export, convert every character column to a factor, and
# derive the predictors used by the models below.
qol <- read_csv("AAQoL.csv") |>
  mutate(across(where(is.character), as.factor)) |>
  # Choose the reference (first) level of the factors used as predictors.
  mutate(
    `English Difficulties` = relevel(`English Difficulties`, ref = "Not at all"),
    `English Speaking` = relevel(`English Speaking`, ref = "Not at all"),
    Ethnicity = relevel(Ethnicity, ref = "Chinese")
  ) |>
  # Collapse the income brackets into a Below/Above indicator; anything
  # unmatched keeps its Income value here and becomes NA in the factor() step.
  mutate(
    Income_median = case_match(
      Income,
      c(
        "$0 - $9,999",
        "$10,000 - $19,999",
        "$20,000 - $29,999",
        "$30,000 - $39,999",
        "$40,000 - $49,999",
        "$50,000 - $59,999"
      ) ~ "Below",
      c("$60,000 - $69,999", "$70,000 and over") ~ "Above",
      .default = Income
    )
  ) |>
  mutate(Income_median = factor(Income_median, levels = c("Below", "Above")))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
# Render the full data set as an interactive DT table.
# NOTE(review): DT warns below that the data is too big for client-side
# processing; consider showing a subset or using server-side processing.
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Source of Information: Family

# Tabulate the levels of Family (count and percent, per the printed tibble).
# `ps()` is a helper defined outside this excerpt. Note the stray "3" level.
ps(Family)
# A tibble: 4 × 3
  Family     n     pct
  <fct>  <int>   <dbl>
1 3          1  0.0383
2 No      1258 48.2   
3 Yes     1331 51.0   
4 <NA>      19  0.728 
# Modelling frame for the "Family" outcome. Only the No/Yes responses are kept
# (ps(Family) shows a stray "3" level and NAs) and the unused levels are
# dropped so the response is a two-level factor.
rfdata <- qol |>
  filter(Family %in% c("No", "Yes")) |>
  mutate(Family = droplevels(Family)) |>
  select(
    Family, Ethnicity, Age, Gender, Religion, `Full Time Employment`,
    Income_median, `English Speaking`, `English Difficulties`,
    `See Family`:`Community Trust`
  ) |>
  # Complete cases only.
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  # Presumably randomForestSRC expects a base data.frame rather than a tibble.
  as.data.frame()

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
# Fit the imbalanced-data forest on all remaining columns, scored by G-mean.
# NOTE(review): no seed is passed, so the OOB metrics presumably vary between
# renders; consider rfsrc's `seed` argument if exact reproducibility matters.
rfobj <- imbalanced(
  Family ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(rfobj)
                         Sample size: 2187
           Frequency of class labels: 1069, 1118
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 481.8583
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1382
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0458
                   (OOB) Brier score: 0.23103992
        (OOB) Normalized Brier score: 0.92415967
                           (OOB) AUC: 0.65240365
                        (OOB) PR-AUC: 0.61641433
                        (OOB) G-mean: 0.60585351
   (OOB) Requested performance error: 0.39414649

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  711 358      0.3349
       Yes 501 617      0.4481

      (OOB) Misclassification rate: 0.3927755
# NOTE(review): this repeats the print(rfobj) call made right after fitting;
# the summary printed below is identical to the one above.
print(rfobj)
                         Sample size: 2187
           Frequency of class labels: 1069, 1118
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 481.8583
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1382
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0458
                   (OOB) Brier score: 0.23103992
        (OOB) Normalized Brier score: 0.92415967
                           (OOB) AUC: 0.65240365
                        (OOB) PR-AUC: 0.61641433
                        (OOB) G-mean: 0.60585351
   (OOB) Requested performance error: 0.39414649

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  711 358      0.3349
       Yes 501 617      0.4481

      (OOB) Misclassification rate: 0.3927755
# Draw the forest's diagnostic plots on separate pages; the call also prints
# the variable-importance table shown below.
plot(rfobj,plots.one.page = FALSE)


                              all   No   Yes
Age                        0.0293   NA    NA
Ethnicity                  0.0262   NA    NA
Similar Values             0.0073   NA    NA
Close-knit Community       0.0061   NA    NA
Spend Time Together        0.0057   NA    NA
Religion                   0.0045   NA    NA
Community Shares Values    0.0039   NA    NA
Close Friends              0.0036   NA    NA
Helpful Community          0.0029   NA    NA
EnglishDiff                0.0026   NA    NA
Community Trust            0.0025   NA    NA
Togetherness               0.0020   NA    NA
Trust                      0.0018   NA    NA
Get Along                  0.0017   NA    NA
Religious Importance       0.0013   NA    NA
Family Pride               0.0011   NA    NA
EnglishSpeak               0.0010   NA    NA
Close Family              -0.0005   NA    NA
See Friends               -0.0005   NA    NA
Loyalty                   -0.0009   NA    NA
Gender                    -0.0013   NA    NA
See Family                -0.0018   NA    NA
Family Respect            -0.0018   NA    NA
Expression                -0.0020   NA    NA
Feel Close                -0.0020   NA    NA
Helpful Family            -0.0021   NA    NA
# Full importance matrix: only the overall ("all") column is populated, the
# per-class columns (No/Yes) are NA in the output below.
rfobj$importance
                                  all No Yes
Ethnicity                0.0262227715 NA  NA
Age                      0.0293419950 NA  NA
Gender                  -0.0013429744 NA  NA
Religion                 0.0045072860 NA  NA
Employment              -0.0053062773 NA  NA
Income_median           -0.0035362445 NA  NA
EnglishSpeak             0.0009702818 NA  NA
EnglishDiff              0.0026345779 NA  NA
See Family              -0.0017686292 NA  NA
Close Family            -0.0004748989 NA  NA
Helpful Family          -0.0021238020 NA  NA
See Friends             -0.0005031878 NA  NA
Close Friends            0.0036216084 NA  NA
Helpful Friends         -0.0037318723 NA  NA
Family Respect          -0.0018223314 NA  NA
Similar Values           0.0072965800 NA  NA
Successful Family       -0.0032018158 NA  NA
Trust                    0.0017544161 NA  NA
Loyalty                 -0.0008508267 NA  NA
Family Pride             0.0010968648 NA  NA
Expression              -0.0019606961 NA  NA
Spend Time Together      0.0056799988 NA  NA
Feel Close              -0.0020219520 NA  NA
Togetherness             0.0020134801 NA  NA
Religious Attendance    -0.0031825823 NA  NA
Religious Importance     0.0013431897 NA  NA
Close-knit Community     0.0060607174 NA  NA
Helpful Community        0.0028718174 NA  NA
Community Shares Values  0.0039405524 NA  NA
Get Along                0.0017066330 NA  NA
Community Trust          0.0025014317 NA  NA
# Overall ("all") importance score for each predictor in the fitted forest.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with predictors sorted by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Source of Information: Health Professionals

# Tabulate the levels of `Heal Professionals` (count and percent, per the
# printed tibble). `ps()` is a helper defined outside this excerpt.
ps(`Heal Professionals`)
# A tibble: 3 × 3
  `Heal Professionals`     n    pct
  <fct>                <int>  <dbl>
1 No                    1326 50.8  
2 Yes                   1264 48.4  
3 <NA>                    19  0.728
# Modelling frame for the `Heal Professionals` outcome. The tabulation shows
# only No/Yes plus NAs, so dropping incomplete rows is the only row filter.
rfdata <- qol |>
  select(
    `Heal Professionals`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  # Complete cases only.
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  # Presumably randomForestSRC expects a base data.frame rather than a tibble.
  as.data.frame()

# Fit the imbalanced-data forest on all remaining columns, scored by G-mean.
# NOTE(review): no seed is passed, so the OOB metrics presumably vary between
# renders; consider rfsrc's `seed` argument if exact reproducibility matters.
rfobj <- imbalanced(
  `Heal Professionals` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)

print(rfobj)
                         Sample size: 2188
           Frequency of class labels: 1067, 1121
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 482.3017
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1383
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0506
                   (OOB) Brier score: 0.2315831
        (OOB) Normalized Brier score: 0.92633239
                           (OOB) AUC: 0.65696526
                        (OOB) PR-AUC: 0.62672582
                        (OOB) G-mean: 0.61389953
   (OOB) Requested performance error: 0.38610047

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  660 407      0.3814
       Yes 438 683      0.3907

      (OOB) Misclassification rate: 0.3861974
# Draw the forest's diagnostic plots on separate pages; the call also prints
# the variable-importance table shown below.
plot(rfobj,plots.one.page = FALSE)


                              all   No   Yes
EnglishSpeak               0.0238   NA    NA
Close Friends              0.0091   NA    NA
Religious Attendance       0.0063   NA    NA
Helpful Community          0.0061   NA    NA
See Friends                0.0049   NA    NA
Income_median              0.0040   NA    NA
Close Family               0.0023   NA    NA
Community Trust            0.0018   NA    NA
See Family                 0.0013   NA    NA
Community Shares Values    0.0010   NA    NA
Get Along                  0.0006   NA    NA
Employment                 0.0004   NA    NA
Similar Values             0.0000   NA    NA
EnglishDiff               -0.0001   NA    NA
Feel Close                -0.0004   NA    NA
Spend Time Together       -0.0005   NA    NA
Trust                     -0.0005   NA    NA
Family Respect            -0.0014   NA    NA
Age                       -0.0014   NA    NA
Gender                    -0.0014   NA    NA
Expression                -0.0018   NA    NA
Loyalty                   -0.0028   NA    NA
Togetherness              -0.0033   NA    NA
Close-knit Community      -0.0037   NA    NA
Helpful Friends           -0.0037   NA    NA
Religious Importance      -0.0037   NA    NA
# Full importance matrix: only the overall ("all") column is populated, the
# per-class columns (No/Yes) are NA in the output below.
rfobj$importance
                                  all No Yes
Ethnicity               -4.967040e-03 NA  NA
Age                     -1.406553e-03 NA  NA
Gender                  -1.418102e-03 NA  NA
Religion                -4.031489e-03 NA  NA
Employment               4.223221e-04 NA  NA
Income_median            4.030721e-03 NA  NA
EnglishSpeak             2.379387e-02 NA  NA
EnglishDiff             -7.625946e-05 NA  NA
See Family               1.336760e-03 NA  NA
Close Family             2.329798e-03 NA  NA
Helpful Family          -6.385242e-03 NA  NA
See Friends              4.939594e-03 NA  NA
Close Friends            9.055062e-03 NA  NA
Helpful Friends         -3.743238e-03 NA  NA
Family Respect          -1.393644e-03 NA  NA
Similar Values           1.634255e-05 NA  NA
Successful Family       -4.137012e-03 NA  NA
Trust                   -4.648994e-04 NA  NA
Loyalty                 -2.784139e-03 NA  NA
Family Pride            -6.775320e-03 NA  NA
Expression              -1.843914e-03 NA  NA
Spend Time Together     -4.648994e-04 NA  NA
Feel Close              -3.730365e-04 NA  NA
Togetherness            -3.272003e-03 NA  NA
Religious Attendance     6.310613e-03 NA  NA
Religious Importance    -3.747298e-03 NA  NA
Close-knit Community    -3.657954e-03 NA  NA
Helpful Community        6.076049e-03 NA  NA
Community Shares Values  9.881442e-04 NA  NA
Get Along                5.879244e-04 NA  NA
Community Trust          1.845371e-03 NA  NA
# Overall ("all") importance score for each predictor in the fitted forest.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with predictors sorted by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Health Insurance

# Tabulate the levels of `Health Insurance` (count and percent, per the
# printed tibble). `ps()` is a helper defined outside this excerpt.
# Note the non-"Yes" level is coded "0" in the output.
ps(`Health Insurance`)
# A tibble: 3 × 3
  `Health Insurance`     n    pct
  <fct>              <int>  <dbl>
1 0                    381 14.6  
2 Yes                 2207 84.6  
3 <NA>                  21  0.805

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the `Health Insurance` outcome (levels "0"/"Yes" per the
# tabulation); rows with any missing value are dropped.
rfdata <- qol |>
  select(
    `Health Insurance`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  # Complete cases only.
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  # Presumably randomForestSRC expects a base data.frame rather than a tibble.
  as.data.frame()

# Fit the imbalanced-data forest on all remaining columns, scored by G-mean.
# NOTE(review): no seed is passed, so the OOB metrics presumably vary between
# renders; consider rfsrc's `seed` argument if exact reproducibility matters.
imb <- imbalanced(
  `Health Insurance` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2189
           Frequency of class labels: 292, 1897
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 258.6077
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1383
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.4966
                   (OOB) Brier score: 0.10366262
        (OOB) Normalized Brier score: 0.41465048
                           (OOB) AUC: 0.74897639
                        (OOB) PR-AUC: 0.33153441
                        (OOB) G-mean: 0.66637896
   (OOB) Requested performance error: 0.33362104

Confusion matrix:

          predicted
  observed   0  Yes class.error
       0   222   70      0.2397
       Yes 789 1108      0.4159

      (OOB) Misclassification rate: 0.3924166
# Summarise the fitted forest's classification metrics (threshold,
# sensitivity, specificity, precision, F1, AUC, G-mean, ... as printed below).
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1897.0000000  292.0000000    6.4965753    0.1333942    0.7602740    0.5840801 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2195846    0.9405772    0.3924166    0.1036626    0.4146505    0.7489764 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3407521    0.4627145    0.1333942    0.3315344    0.5035655    0.5645467 
       gmean 
   0.6663790 
# Overall ("all") importance score for each predictor in the fitted forest.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with predictors sorted by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)

Dental Insurance

# Tabulate the levels of `Dental Insurance` (count and percent, per the
# printed tibble). `ps()` is a helper defined outside this excerpt.
# Note the non-"Yes" level is coded "0" in the output.
ps(`Dental Insurance`)
# A tibble: 3 × 3
  `Dental Insurance`     n   pct
  <fct>              <int> <dbl>
1 0                   1050 40.2 
2 Yes                 1529 58.6 
3 <NA>                  30  1.15

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the `Dental Insurance` outcome (levels "0"/"Yes" per the
# tabulation); rows with any missing value are dropped.
rfdata <- qol |>
  select(
    `Dental Insurance`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  # Complete cases only.
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  # Presumably randomForestSRC expects a base data.frame rather than a tibble.
  as.data.frame()

# Fit the imbalanced-data forest on all remaining columns, scored by G-mean.
# NOTE(review): no seed is passed, so the OOB metrics presumably vary between
# renders; consider rfsrc's `seed` argument if exact reproducibility matters.
imb <- imbalanced(
  `Dental Insurance` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2184
           Frequency of class labels: 849, 1335
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 397.4423
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1380
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.5724
                   (OOB) Brier score: 0.17748606
        (OOB) Normalized Brier score: 0.70994423
                           (OOB) AUC: 0.79696757
                        (OOB) PR-AUC: 0.70071511
                        (OOB) G-mean: 0.73335655
   (OOB) Requested performance error: 0.26664345

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   643 206      0.2426
       Yes 387 948      0.2899

      (OOB) Misclassification rate: 0.2715201
# Summarise the fitted forest's classification metrics (threshold,
# sensitivity, specificity, precision, F1, AUC, G-mean, ... as printed below).
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1335.0000000  849.0000000    1.5724382    0.3887363    0.7573616    0.7101124 
        prec          npv     misclass        brier   brier.norm          auc 
   0.6242718    0.8214905    0.2715201    0.1774861    0.7099442    0.7969676 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.6844066    0.7210108    0.3887363    0.7007151    0.7088816    0.7271837 
       gmean 
   0.7333566 
# Overall ("all") importance score for each predictor in the fitted forest.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with predictors sorted by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)

Physical Checkup

# Tabulate the levels of `Physical Check-up` (count and percent, per the
# printed tibble). `ps()` is a helper defined outside this excerpt.
# Note the non-"Yes" level is coded "0" in the output.
ps(`Physical Check-up`)
# A tibble: 3 × 3
  `Physical Check-up`     n   pct
  <fct>               <int> <dbl>
1 0                     833 31.9 
2 Yes                  1740 66.7 
3 <NA>                   36  1.38

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the `Physical Check-up` outcome (levels "0"/"Yes" per
# the tabulation); rows with any missing value are dropped.
rfdata <- qol |>
  select(
    `Physical Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  # Complete cases only.
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  # Presumably randomForestSRC expects a base data.frame rather than a tibble.
  as.data.frame()

# Fit the imbalanced-data forest on all remaining columns, scored by G-mean.
# NOTE(review): no seed is passed, so the OOB metrics presumably vary between
# renders; consider rfsrc's `seed` argument if exact reproducibility matters.
imb <- imbalanced(
  `Physical Check-up` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2178
           Frequency of class labels: 704, 1474
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 425.4703
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1376
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 2.0938
                   (OOB) Brier score: 0.19880312
        (OOB) Normalized Brier score: 0.7952125
                           (OOB) AUC: 0.68752313
                        (OOB) PR-AUC: 0.49471621
                        (OOB) G-mean: 0.63669065
   (OOB) Requested performance error: 0.36330935

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   488 216      0.3068
       Yes 612 862      0.4152

      (OOB) Misclassification rate: 0.3801653
# Draw the forest's diagnostic plots on separate pages; the call also prints
# the variable-importance table shown below. FALSE is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned.
plot(imb, plots.one.page = FALSE)


                              all    0   Yes
Age                        0.0573   NA    NA
Income_median              0.0223   NA    NA
Ethnicity                  0.0103   NA    NA
Employment                 0.0085   NA    NA
Togetherness               0.0020   NA    NA
Community Trust            0.0012   NA    NA
Helpful Friends            0.0007   NA    NA
Close Family               0.0007   NA    NA
Religion                   0.0003   NA    NA
Feel Close                -0.0005   NA    NA
Expression                -0.0011   NA    NA
See Family                -0.0012   NA    NA
Similar Values            -0.0016   NA    NA
EnglishSpeak              -0.0017   NA    NA
Religious Importance      -0.0031   NA    NA
Loyalty                   -0.0031   NA    NA
Gender                    -0.0038   NA    NA
Helpful Family            -0.0041   NA    NA
EnglishDiff               -0.0050   NA    NA
Community Shares Values   -0.0051   NA    NA
Helpful Community         -0.0055   NA    NA
Family Pride              -0.0056   NA    NA
Spend Time Together       -0.0059   NA    NA
See Friends               -0.0059   NA    NA
Close-knit Community      -0.0062   NA    NA
Family Respect            -0.0068   NA    NA
# Summarise the fitted forest's classification metrics (threshold,
# sensitivity, specificity, precision, F1, AUC, G-mean, ... as printed below).
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1474.0000000  704.0000000    2.0937500    0.3232323    0.6931818    0.5848033 
        prec          npv     misclass        brier   brier.norm          auc 
   0.4436364    0.7996289    0.3801653    0.1988031    0.7952125    0.6875231 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.5410200    0.6008462    0.3232323    0.4947162    0.5888553    0.6187684 
       gmean 
   0.6366907 
# Overall ("all") importance score for each predictor in the fitted forest.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with predictors sorted by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)

Dental Checkup

# Tabulate the levels of `Dentist Check-up` (count and percent, per the
# printed tibble). `ps()` is a helper defined outside this excerpt.
# Note the non-"Yes" level is coded "0" in the output.
ps(`Dentist Check-up`)
# A tibble: 3 × 3
  `Dentist Check-up`     n   pct
  <fct>              <int> <dbl>
1 0                   1100 42.2 
2 Yes                 1462 56.0 
3 <NA>                  47  1.80

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the `Dentist Check-up` outcome (levels "0"/"Yes" per the
# tabulation); rows with any missing value are dropped.
rfdata <- qol |>
  select(
    `Dentist Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  # Complete cases only.
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  # Presumably randomForestSRC expects a base data.frame rather than a tibble.
  as.data.frame()

# Fit the imbalanced-data forest on all remaining columns, scored by G-mean.
# NOTE(review): no seed is passed, so the OOB metrics presumably vary between
# renders; consider rfsrc's `seed` argument if exact reproducibility matters.
imb <- imbalanced(
  `Dentist Check-up` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2175
           Frequency of class labels: 896, 1279
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 450.7223
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1375
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.4275
                   (OOB) Brier score: 0.21277143
        (OOB) Normalized Brier score: 0.85108573
                           (OOB) AUC: 0.70613595
                        (OOB) PR-AUC: 0.60028161
                        (OOB) G-mean: 0.65337547
   (OOB) Requested performance error: 0.34662453

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   610 286      0.3192
       Yes 477 802      0.3729

      (OOB) Misclassification rate: 0.3508046
# Draw the forest's diagnostic plots on separate pages; the call also prints
# the variable-importance table shown below. FALSE is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned.
plot(imb, plots.one.page = FALSE)


                           all    0   Yes
EnglishSpeak            0.0192   NA    NA
Age                     0.0122   NA    NA
Gender                  0.0032   NA    NA
Helpful Friends         0.0031   NA    NA
Religious Importance    0.0023   NA    NA
Ethnicity               0.0016   NA    NA
Income_median           0.0014   NA    NA
Helpful Family          0.0011   NA    NA
EnglishDiff             0.0008   NA    NA
Feel Close              0.0007   NA    NA
Close Friends           0.0007   NA    NA
See Friends             0.0005   NA    NA
Close Family            0.0005   NA    NA
Employment              0.0001   NA    NA
Community Trust        -0.0002   NA    NA
Spend Time Together    -0.0002   NA    NA
Trust                  -0.0005   NA    NA
Religion               -0.0005   NA    NA
See Family             -0.0013   NA    NA
Get Along              -0.0017   NA    NA
Religious Attendance   -0.0023   NA    NA
Loyalty                -0.0028   NA    NA
Family Pride           -0.0036   NA    NA
Successful Family      -0.0044   NA    NA
Togetherness           -0.0045   NA    NA
Expression             -0.0049   NA    NA
# Summarise the fitted forest's classification metrics (threshold,
# sensitivity, specificity, precision, F1, AUC, G-mean, ... as printed below).
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1279.0000000  896.0000000    1.4274554    0.4119540    0.6808036    0.6270524 
        prec          npv     misclass        brier   brier.norm          auc 
   0.5611776    0.7371324    0.3508046    0.2127714    0.8510857    0.7061359 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.6152295    0.6449334    0.4119540    0.6002816    0.6343025    0.6491544 
       gmean 
   0.6533755 
# Overall ("all") importance score for each predictor in the fitted forest.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with predictors sorted by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)

Urgent Care

# Tabulate the levels of Urgentcare (count and percent, per the printed
# tibble). `ps()` is a helper defined outside this excerpt.
# Note the non-"Yes" level is coded "0" in the output.
ps(`Urgentcare`)
# A tibble: 3 × 3
  Urgentcare     n   pct
  <fct>      <int> <dbl>
1 0           2112 81.0 
2 Yes          440 16.9 
3 <NA>          57  2.18

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the Urgentcare outcome (levels "0"/"Yes" per the
# tabulation); rows with any missing value are dropped.
rfdata <- qol |>
  select(
    `Urgentcare`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  # Complete cases only.
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  # Presumably randomForestSRC expects a base data.frame rather than a tibble.
  as.data.frame()

# Fit the imbalanced-data forest on all remaining columns, scored by G-mean.
# NOTE(review): no seed is passed, so the OOB metrics presumably vary between
# renders; consider rfsrc's `seed` argument if exact reproducibility matters.
imb <- imbalanced(
  `Urgentcare` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2167
           Frequency of class labels: 1808, 359
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 326.4887
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1370
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 5.0362
                   (OOB) Brier score: 0.1369538
        (OOB) Normalized Brier score: 0.54781519
                           (OOB) AUC: 0.59534612
                        (OOB) PR-AUC: 0.23587161
                        (OOB) G-mean: 0.5668512
   (OOB) Requested performance error: 0.4331488

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   880 928      0.5133
       Yes 122 237      0.3398

      (OOB) Misclassification rate: 0.4845408
# Draw the forest's diagnostic plots on separate pages; the call also prints
# the variable-importance table shown below. FALSE is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned.
plot(imb, plots.one.page = FALSE)


                          all    0   Yes
Close Family           0.0370   NA    NA
Ethnicity              0.0241   NA    NA
Age                    0.0218   NA    NA
Close Friends          0.0195   NA    NA
Loyalty                0.0195   NA    NA
Trust                  0.0190   NA    NA
EnglishDiff            0.0187   NA    NA
Helpful Family         0.0186   NA    NA
Religion               0.0164   NA    NA
Family Respect         0.0162   NA    NA
EnglishSpeak           0.0154   NA    NA
Religious Attendance   0.0152   NA    NA
Spend Time Together    0.0146   NA    NA
Feel Close             0.0143   NA    NA
Community Trust        0.0132   NA    NA
Family Pride           0.0125   NA    NA
See Friends            0.0124   NA    NA
Helpful Friends        0.0123   NA    NA
Helpful Community      0.0120   NA    NA
Togetherness           0.0113   NA    NA
Close-knit Community   0.0107   NA    NA
Income_median          0.0107   NA    NA
Religious Importance   0.0089   NA    NA
Expression             0.0087   NA    NA
Similar Values         0.0085   NA    NA
See Family             0.0083   NA    NA
# Summarise the fitted forest's classification metrics (threshold,
# sensitivity, specificity, precision, F1, AUC, G-mean, ... as printed below).
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1808.0000000  359.0000000    5.0362117    0.1656668    0.6601671    0.4867257 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2034335    0.8782435    0.4845408    0.1369538    0.5478152    0.5953461 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3110236    0.4156465    0.1656668    0.2358716    0.4389374    0.4912489 
       gmean 
   0.5668512 
# Overall ("all") importance score for each predictor in the fitted forest.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with predictors sorted by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)

Folk Medicine

# Tabulate the levels of Folkmedicine (count and percent, per the printed
# tibble). `ps()` is a helper defined outside this excerpt.
# Note the non-"Yes" level is coded "0" in the output.
ps(`Folkmedicine`)
# A tibble: 3 × 3
  Folkmedicine     n   pct
  <fct>        <int> <dbl>
1 0             2189 83.9 
2 Yes            348 13.3 
3 <NA>            72  2.76

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the Folkmedicine outcome (levels "0"/"Yes" per the
# tabulation); rows with any missing value are dropped.
rfdata <- qol |>
  select(
    `Folkmedicine`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  # Complete cases only.
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  # Presumably randomForestSRC expects a base data.frame rather than a tibble.
  as.data.frame()

# Fit the imbalanced-data forest on all remaining columns, scored by G-mean.
# NOTE(review): no seed is passed, so the OOB metrics presumably vary between
# renders; consider rfsrc's `seed` argument if exact reproducibility matters.
imb <- imbalanced(
  `Folkmedicine` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2152
           Frequency of class labels: 1866, 286
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 271.6277
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1360
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.5245
                   (OOB) Brier score: 0.11210216
        (OOB) Normalized Brier score: 0.44840865
                           (OOB) AUC: 0.66744617
                        (OOB) PR-AUC: 0.21621988
                        (OOB) G-mean: 0.62839779
   (OOB) Requested performance error: 0.37160221

Confusion matrix:

          predicted
  observed    0 Yes class.error
       0   1028 838      0.4491
       Yes   81 205      0.2832

      (OOB) Misclassification rate: 0.4270446
# Draw the forest's diagnostic plots on separate pages; the call also prints
# the variable-importance table shown below. FALSE is spelled out because the
# shorthand `F` is an ordinary variable that can be reassigned.
plot(imb, plots.one.page = FALSE)


                             all    0   Yes
Age                       0.0504   NA    NA
Ethnicity                 0.0468   NA    NA
EnglishSpeak              0.0196   NA    NA
EnglishDiff               0.0178   NA    NA
Employment                0.0132   NA    NA
Helpful Friends           0.0117   NA    NA
Family Pride              0.0111   NA    NA
Trust                     0.0095   NA    NA
Feel Close                0.0079   NA    NA
Community Trust           0.0068   NA    NA
Expression                0.0066   NA    NA
Helpful Community         0.0061   NA    NA
Income_median             0.0061   NA    NA
Close Friends             0.0049   NA    NA
Loyalty                   0.0047   NA    NA
Close-knit Community      0.0046   NA    NA
Togetherness              0.0046   NA    NA
Community Shares Values   0.0044   NA    NA
See Friends               0.0043   NA    NA
Religious Attendance      0.0040   NA    NA
Successful Family         0.0037   NA    NA
Similar Values            0.0035   NA    NA
Religion                  0.0032   NA    NA
Get Along                 0.0031   NA    NA
Family Respect            0.0026   NA    NA
Religious Importance      0.0022   NA    NA
# Summarise the fitted forest's classification metrics (threshold,
# sensitivity, specificity, precision, F1, AUC, G-mean, ... as printed below).
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1866.0000000  286.0000000    6.5244755    0.1328996    0.7167832    0.5509110 
        prec          npv     misclass        brier   brier.norm          auc 
   0.1965484    0.9269612    0.4270446    0.1121022    0.4484087    0.6674462 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3085026    0.4265804    0.1328996    0.2162199    0.4684502    0.5274891 
       gmean 
   0.6283978 
# Overall ("all") importance score for each predictor in the fitted forest.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with predictors sorted by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)